Introduction

The material in this workshop is strongly influenced by the following free online materials:

Introduction to R

Variable Assignment

#assign function
assign('w', 0)
w
## [1] 0
#left arrow
x <- 1
x
## [1] 1
#equal
y = 2
y
## [1] 2
#right arrow
3 -> z
z
## [1] 3
w + x + y + z
## [1] 6

Data Structures

Atomic Vectors

Vectors are single dimension data collections of the same type.

#use `c` function to combine individual datum together in a vector
#character
char_vector <-  c('a', 'b', 'c', 'd')
char_vector
## [1] "a" "b" "c" "d"
#for numbers, you can get a vector range by using `:`
#integer
int_vector <- 1:10
int_vector
##  [1]  1  2  3  4  5  6  7  8  9 10
#combining int and char vectors returns a character
combo_int_char <- c(char_vector, int_vector)
typeof(combo_int_char)
## [1] "character"
combo_int_char
##  [1] "a"  "b"  "c"  "d"  "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"
#double
dbl_vector <- seq(1, 2, by=.2)
dbl_vector
## [1] 1.0 1.2 1.4 1.6 1.8 2.0
#logical/boolean
logical_vector <- c(FALSE, TRUE, F, T)
logical_vector
## [1] FALSE  TRUE FALSE  TRUE

Items can be extracted from vector using square bracket [index] notation. R is a 1 index language and the first item in most data structures is in position 1.

#first element of char_vector
char_vector[1]
## [1] "a"
#last element of int_vector (calculate the size of vector with `length()`)
int_vector[length(int_vector)]
## [1] 10

Matrices & Arrays

Matrices are two dimensional data collections of the same type.

matrix(1:10)
##       [,1]
##  [1,]    1
##  [2,]    2
##  [3,]    3
##  [4,]    4
##  [5,]    5
##  [6,]    6
##  [7,]    7
##  [8,]    8
##  [9,]    9
## [10,]   10
matrix(1:10, ncol = 2)
##      [,1] [,2]
## [1,]    1    6
## [2,]    2    7
## [3,]    3    8
## [4,]    4    9
## [5,]    5   10
myMatrix <- matrix(1:10, ncol = 2, byrow = T)
myMatrix
##      [,1] [,2]
## [1,]    1    2
## [2,]    3    4
## [3,]    5    6
## [4,]    7    8
## [5,]    9   10
matrix(sample(0:1, size = 100, replace = T), ncol = 10, nrow = 10)
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]    0    0    0    0    1    0    1    1    0     0
##  [2,]    0    1    0    1    1    1    1    0    0     0
##  [3,]    0    0    0    0    0    1    1    0    0     1
##  [4,]    0    0    1    1    1    1    1    1    0     1
##  [5,]    0    0    0    0    1    1    0    0    1     1
##  [6,]    0    1    1    0    0    1    0    0    1     1
##  [7,]    1    1    0    1    0    0    1    0    0     1
##  [8,]    1    1    0    0    0    0    1    1    1     1
##  [9,]    0    1    1    0    0    1    1    0    1     0
## [10,]    1    0    0    1    1    0    0    1    0     0

Items can be retrieved from a matrix by using the square bracket [row, column] notation

#first row
myMatrix[1,]
## [1] 1 2
#first column
myMatrix[,1]
## [1] 1 3 5 7 9
#first row first column
myMatrix[1,1]
## [1] 1

Lists

Lists are the R objects which contain elements of different types like − numbers, strings, vectors and another list inside it. A list can also contain a matrix or a function as its elements.

myList <- list(
  char_vector,
  int_vector,
  dbl_vector,
  logical_vector,
  myMatrix
)

myList
## [[1]]
## [1] "a" "b" "c" "d"
## 
## [[2]]
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## [[3]]
## [1] 1.0 1.2 1.4 1.6 1.8 2.0
## 
## [[4]]
## [1] FALSE  TRUE FALSE  TRUE
## 
## [[5]]
##      [,1] [,2]
## [1,]    1    2
## [2,]    3    4
## [3,]    5    6
## [4,]    7    8
## [5,]    9   10

Items can be retrieved from a list by using double bracket notation [[index]].

myList[[1]]
## [1] "a" "b" "c" "d"

The indexes in a list can also be named. If they are named, then you can use dollar sign notation $name to retrieve an item from the list.

myNamedList <- list(
  "char" = char_vector,
  "int" = int_vector,
  "dbl" = dbl_vector,
  "lgl" = logical_vector,
  "mat" = myMatrix,
  "list" = myList
)

myNamedList$char
## [1] "a" "b" "c" "d"

Data Frames

A data frame is used for storing data tables. It is a list of vectors of equal length. It should resemble a typical table structure similar to an Excel worksheet.

myDF <- data.frame(
  char = c('a', 'b', 'c', 'd', 'e', 'f'),
  num = 1:6,
  lgl = rep(c(T, F), 3)
)

myDF
##   char num   lgl
## 1    a   1  TRUE
## 2    b   2 FALSE
## 3    c   3  TRUE
## 4    d   4 FALSE
## 5    e   5  TRUE
## 6    f   6 FALSE
myDF[1:3,'num']
## [1] 1 2 3
myDF$num[1:3]
## [1] 1 2 3
myDF[1:3, c('num', 'lgl')]
##   num   lgl
## 1   1  TRUE
## 2   2 FALSE
## 3   3  TRUE

Functions

Tidyverse

To explore functions from the tidyverse we will create two dataframes.

state_df <- data.frame(
  NAME = state.name,
  ABB = state.abb,
  AREA = state.area
)

head(state_df)
##         NAME ABB   AREA
## 1    Alabama  AL  51609
## 2     Alaska  AK 589757
## 3    Arizona  AZ 113909
## 4   Arkansas  AR  53104
## 5 California  CA 158693
## 6   Colorado  CO 104247
state_regions <- data.frame(
  NAME = c(state.name, 'District of Columbia'),
  REGION = c(as.character(state.region), 'South')
)

head(state_regions)
##         NAME REGION
## 1    Alabama  South
## 2     Alaska   West
## 3    Arizona   West
## 4   Arkansas  South
## 5 California   West
## 6   Colorado   West

magrittr::%>%

Of the Tidyverse packages, the only one we will explicitly load into our environment is magrittr

library(magrittr)

dplyr::filter

state_regions %>%
  dplyr::filter(REGION == 'Northeast')
##            NAME    REGION
## 1   Connecticut Northeast
## 2         Maine Northeast
## 3 Massachusetts Northeast
## 4 New Hampshire Northeast
## 5    New Jersey Northeast
## 6      New York Northeast
## 7  Pennsylvania Northeast
## 8  Rhode Island Northeast
## 9       Vermont Northeast
state_df %>%
  dplyr::filter(AREA >= 150000)
##         NAME ABB   AREA
## 1     Alaska  AK 589757
## 2 California  CA 158693
## 3      Texas  TX 267339
state_df %>%
  dplyr::filter(ABB %in% c("PA", "OH", "MD"))
##           NAME ABB  AREA
## 1     Maryland  MD 10577
## 2         Ohio  OH 41222
## 3 Pennsylvania  PA 45333

stringr::str_detect

stringr::str_detect(string = 'New York', pattern = 'New')
## [1] TRUE
state_df %>%
  dplyr::filter(stringr::str_detect(NAME, 'New'))
##            NAME ABB   AREA
## 1 New Hampshire  NH   9304
## 2    New Jersey  NJ   7836
## 3    New Mexico  NM 121666
## 4      New York  NY  49576

dplyr::x_join

state_df %>%
  dplyr::left_join(state_regions, by = 'NAME') %>%
  tail()
## Warning: Column `NAME` joining factors with different levels, coercing to
## character vector
##             NAME ABB  AREA        REGION
## 45       Vermont  VT  9609     Northeast
## 46      Virginia  VA 40815         South
## 47    Washington  WA 68192          West
## 48 West Virginia  WV 24181         South
## 49     Wisconsin  WI 56154 North Central
## 50       Wyoming  WY 97914          West
state_df %>%
  dplyr::full_join(state_regions, by = 'NAME') %>%
  tail()
## Warning: Column `NAME` joining factors with different levels, coercing to
## character vector
##                    NAME  ABB  AREA        REGION
## 46             Virginia   VA 40815         South
## 47           Washington   WA 68192          West
## 48        West Virginia   WV 24181         South
## 49            Wisconsin   WI 56154 North Central
## 50              Wyoming   WY 97914          West
## 51 District of Columbia <NA>    NA         South
state_regions %>%
  dplyr::anti_join(state_df, by = 'NAME')
## Warning: Column `NAME` joining factors with different levels, coercing to
## character vector
##                   NAME REGION
## 1 District of Columbia  South

purrr::map_x

iterates through the rows of a table.

state_df$AREA %>%
  head() %>%
  purrr::map(sqrt)
## [[1]]
## [1] 227.1761
## 
## [[2]]
## [1] 767.9564
## 
## [[3]]
## [1] 337.5041
## 
## [[4]]
## [1] 230.4431
## 
## [[5]]
## [1] 398.3629
## 
## [[6]]
## [1] 322.873
state_df$AREA %>%
  head() %>%
  purrr::map_dbl(sqrt)
## [1] 227.1761 767.9564 337.5041 230.4431 398.3629 322.8730
state_df %>%
  head() %>%
  {
    purrr::map2_chr(.$NAME, .$ABB, function(x, y){
      paste(x, 'is', y)
    })
  }
## [1] "Alabama is AL"    "Alaska is AK"     "Arizona is AZ"   
## [4] "Arkansas is AR"   "California is CA" "Colorado is CO"
state_df %>%
  head() %>%
  {
    purrr::pmap_chr(., function(NAME, ABB, AREA){
      paste(NAME, 'or', ABB, 'has an area of', AREA )
    })
  }
## [1] "Alabama or AL has an area of 51609"    
## [2] "Alaska or AK has an area of 589757"    
## [3] "Arizona or AZ has an area of 113909"   
## [4] "Arkansas or AR has an area of 53104"   
## [5] "California or CA has an area of 158693"
## [6] "Colorado or CO has an area of 104247"

dplyr::group_by & tidyr::nest

nested_regions <- state_regions %>%
  dplyr::group_by(REGION) %>%
  tidyr::nest()

nested_regions
## # A tibble: 4 x 2
##   REGION        data             
##   <fct>         <list>           
## 1 South         <tibble [17 × 1]>
## 2 West          <tibble [13 × 1]>
## 3 Northeast     <tibble [9 × 1]> 
## 4 North Central <tibble [12 × 1]>
new_regions <- nested_regions %>%
  dplyr::mutate(n_new_in_region = purrr::map_dbl(data, function(d){
    d %>%
      dplyr::filter(stringr::str_detect(NAME, 'New')) %>%
      nrow()
  }))

new_regions
## # A tibble: 4 x 3
##   REGION        data              n_new_in_region
##   <fct>         <list>                      <dbl>
## 1 South         <tibble [17 × 1]>               0
## 2 West          <tibble [13 × 1]>               1
## 3 Northeast     <tibble [9 × 1]>                3
## 4 North Central <tibble [12 × 1]>               0
new_regions %>%
  tidyr::unnest() %>%
  dplyr::arrange(desc(n_new_in_region))
## # A tibble: 51 x 3
##    REGION    n_new_in_region NAME         
##    <fct>               <dbl> <fct>        
##  1 Northeast               3 Connecticut  
##  2 Northeast               3 Maine        
##  3 Northeast               3 Massachusetts
##  4 Northeast               3 New Hampshire
##  5 Northeast               3 New Jersey   
##  6 Northeast               3 New York     
##  7 Northeast               3 Pennsylvania 
##  8 Northeast               3 Rhode Island 
##  9 Northeast               3 Vermont      
## 10 West                    1 Alaska       
## # … with 41 more rows

stringr::str_replace

stringr::str_replace(string = 'Philadelphia Metro Area', pattern = ' Metro Area', replacement = '')
## [1] "Philadelphia"

Simple Features

Introduction

library(sf)
library(spData)

Spatial Types

point_mat <- matrix(c(1, 3, 5, 1, 4, 1), ncol = 2)
point_mat
##      [,1] [,2]
## [1,]    1    1
## [2,]    3    4
## [3,]    5    1
multipoint = st_multipoint(point_mat)
multipoint
## MULTIPOINT (1 1, 3 4, 5 1)
plot(multipoint, cex = 2.5)

linestring = st_cast(multipoint, "LINESTRING")
linestring
## LINESTRING (1 1, 3 4, 5 1)
plot(linestring)

polyg = st_cast(multipoint, "POLYGON")
polyg
## POLYGON ((1 1, 3 4, 5 1, 1 1))
plot(polyg, col = 'lightgrey', border = 'purple', lwd = 4)

st_cast(polyg, 'MULTIPOINT')
## MULTIPOINT (1 1, 3 4, 5 1)
hole_points <- list(
  matrix(c(1,1, 3,4, 5,1, 1,1), ncol = 2, byrow = T),
  matrix(c(2,2, 3,3, 4,2, 2,2), ncol = 2, byrow = T),
  matrix(c(2.25,2.25, 3,2.75, 3.75,2.25, 2.25,2.25), ncol=2, byrow = T),
  matrix(c(1,3, 1,4, 2,3.5, 1,3), ncol = 2, byrow =T),
  matrix(c(3,3, 5,4, 5,2, 3,3), ncol = 2, byrow =T)
)

holy_polygon <- st_polygon(hole_points)
holy_polygon
## POLYGON ((1 1, 3 4, 5 1, 1 1), (2 2, 3 3, 4 2, 2 2), (2.25 2.25, 3 2.75, 3.75 2.25, 2.25 2.25), (1 3, 1 4, 2 3.5, 1 3), (3 3, 5 4, 5 2, 3 3))
plot(holy_polygon, col = 'lightgrey')

multi_poly <- st_cast(holy_polygon, 'MULTIPOLYGON')
plot(multi_poly, col = 'lightgrey')

Spatial Predicates

# create a polygon
a_poly = st_polygon(list(rbind(c(-1, -1), c(1, -1), c(1, 1), c(-1, -1))))
a = st_sfc(a_poly)

# create a line
l_line = st_linestring(x = matrix(c(-1, 1, 1, -1), ncol = 2))
l = st_sfc(l_line)

# create points
p_matrix = matrix(c(0.5, 1, -1, 0, 0, 1, 1, 1), ncol = 2)
p_multi = st_multipoint(x = p_matrix)
p = st_sf(st_cast(st_sfc(p_multi), "POINT"))

plot(a)
plot(p, add = T, pch = 19)
plot(l, add = T)

sf::st_within

p_within <- p %>%
  dplyr::filter(st_within(., a, sparse = F))

plot(a)
plot(p_within, add = T, pch = 19)

sf::st_touches

p_touches <- p %>%
  dplyr::filter(st_touches(., a, sparse = F))

plot(a)
plot(p_touches, add = T, pch = 19)

sf::st_intersects

p_intersects <- p %>%
  dplyr::filter(st_intersects(., a, sparse = F))

plot(a)
plot(p_intersects, add = T, pch = 19)

sf::st_disjoint

p_disjoint <- p %>%
  dplyr::filter(st_disjoint(., a, sparse = F))

plot(a)
plot(p_disjoint, add = T, pch = 19)

sf::st_is_within_distances

p_withindist <- p %>%
  dplyr::filter(st_is_within_distance(., a, dist = 1, sparse = F))

plot(a)
plot(p_withindist, add = T, pch = 19)

p_close_but_not_inside <- p %>%
  dplyr::filter(st_is_within_distance(., a, dist = 1, sparse = F),
                st_disjoint(., a, sparse = F))  

plot(a)
plot(p_close_but_not_inside, add = T, pch = 19)

sf::st_union

line_poly <- st_union(l, a)

p_touches <- p %>%
  dplyr::filter(st_touches(., line_poly, sparse = F))

line_poly %>%
  st_cast %>%
  plot()
plot(p_touches, add = T)

I/O & Data Cleaning

us_states <- st_read('data/cb_2017_us_state_500k/cb_2017_us_state_500k.shp')
## Reading layer `cb_2017_us_state_500k' from data source `/Users/ben/GEODEV2019/data/cb_2017_us_state_500k/cb_2017_us_state_500k.shp' using driver `ESRI Shapefile'
## Simple feature collection with 56 features and 9 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -179.1489 ymin: -14.5487 xmax: 179.7785 ymax: 71.36516
## epsg (SRID):    4269
## proj4string:    +proj=longlat +datum=NAD83 +no_defs
head(us_states)
## Simple feature collection with 6 features and 9 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -117.243 ymin: 36.9703 xmax: -71.46455 ymax: 49.00115
## epsg (SRID):    4269
## proj4string:    +proj=longlat +datum=NAD83 +no_defs
##   STATEFP  STATENS    AFFGEOID GEOID STUSPS          NAME LSAD
## 1      54 01779805 0400000US54    54     WV West Virginia   00
## 2      17 01779784 0400000US17    17     IL      Illinois   00
## 3      24 01714934 0400000US24    24     MD      Maryland   00
## 4      16 01779783 0400000US16    16     ID         Idaho   00
## 5      50 01779802 0400000US50    50     VT       Vermont   00
## 6      09 01779780 0400000US09    09     CT   Connecticut   00
##          ALAND     AWATER                       geometry
## 1  62265662566  489840834 MULTIPOLYGON (((-82.6432 38...
## 2 143784114293 6211277447 MULTIPOLYGON (((-91.51297 4...
## 3  25150696145 6980371026 MULTIPOLYGON (((-76.05015 3...
## 4 214048160737 2393355752 MULTIPOLYGON (((-117.2427 4...
## 5  23873457570 1031134839 MULTIPOLYGON (((-73.43774 4...
## 6  12542619303 1815495323 MULTIPOLYGON (((-72.76143 4...
plot(us_states$geometry) 

conus <- us_states %>%
  dplyr::filter(!NAME %in% c("Alaska",
                             "American Samoa",
                             "Commonwealth of the Northern Mariana Islands",
                             "Hawaii",
                             "Guam",
                             "Puerto Rico",
                             "United States Virgin Islands"
                             ))

plot(conus$geometry)

conus_outline <- st_union(conus$geometry)
conus_outline
## Geometry set for 1 feature 
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -124.7631 ymin: 24.5231 xmax: -66.9499 ymax: 49.38436
## epsg (SRID):    4269
## proj4string:    +proj=longlat +datum=NAD83 +no_defs
## MULTIPOLYGON (((-82.00977 24.53491, -82.00931 2...
plot(conus_outline)

amtrak <- st_read('data/amtrak_rails/amtrak_rails.shp')
## Reading layer `amtrak_rails' from data source `/Users/ben/GEODEV2019/data/amtrak_rails/amtrak_rails.shp' using driver `ESRI Shapefile'
## Simple feature collection with 46 features and 4 fields
## geometry type:  MULTILINESTRING
## dimension:      XY
## bbox:           xmin: -123.2017 ymin: 25.7818 xmax: -69.96816 ymax: 49.24535
## epsg (SRID):    4269
## proj4string:    +proj=longlat +datum=NAD83 +no_defs
amtrak
## Simple feature collection with 46 features and 4 fields
## geometry type:  MULTILINESTRING
## dimension:      XY
## bbox:           xmin: -123.2017 ymin: 25.7818 xmax: -69.96816 ymax: 49.24535
## epsg (SRID):    4269
## proj4string:    +proj=longlat +datum=NAD83 +no_defs
## First 10 features:
##    OBJECTID                NAME Shape_Leng ShapeSTLen
## 1         1               Acela   741010.5   977920.7
## 2         2          Adirondack   615675.4   843749.7
## 3         3          Auto Train  1474023.7  1776111.7
## 4         4          Blue Water   511438.4   693922.3
## 5         5   California Zephyr  4313889.5  5672841.5
## 6         6     Capitol Limited  1287416.4  1703069.5
## 7         7            Cardinal  1865049.5  2409211.1
## 8         8 Carolinian/Piedmont  1166213.4  1473517.0
## 9         9            Cascades   755629.4  1105895.4
## 10       10  Chicago - St.Louis   454576.7   595026.6
##                          geometry
## 1  MULTILINESTRING ((-77.01421...
## 2  MULTILINESTRING ((-73.74197...
## 3  MULTILINESTRING ((-81.3177 ...
## 4  MULTILINESTRING ((-87.6361 ...
## 5  MULTILINESTRING ((-108.5559...
## 6  MULTILINESTRING ((-77.00406...
## 7  MULTILINESTRING ((-80.87038...
## 8  MULTILINESTRING ((-80.82749...
## 9  MULTILINESTRING ((-123.0638...
## 10 MULTILINESTRING ((-90.20919...
plot(amtrak$geometry)

plot(conus_outline)
plot(amtrak$geometry, add = T)

state_regions <- data.frame(
  NAME = c(state.name, "District of Columbia"),
  REGION = c(as.character(state.region), "South")
)

head(state_regions)
##         NAME REGION
## 1    Alabama  South
## 2     Alaska   West
## 3    Arizona   West
## 4   Arkansas  South
## 5 California   West
## 6   Colorado   West
conus_w_region <- conus %>%
  dplyr::left_join(state_regions)
## Joining, by = "NAME"
## Warning: Column `NAME` joining factors with different levels, coercing to
## character vector
head(conus_w_region)
## Simple feature collection with 6 features and 10 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -117.243 ymin: 36.9703 xmax: -71.46455 ymax: 49.00115
## epsg (SRID):    4269
## proj4string:    +proj=longlat +datum=NAD83 +no_defs
##   STATEFP  STATENS    AFFGEOID GEOID STUSPS          NAME LSAD
## 1      54 01779805 0400000US54    54     WV West Virginia   00
## 2      17 01779784 0400000US17    17     IL      Illinois   00
## 3      24 01714934 0400000US24    24     MD      Maryland   00
## 4      16 01779783 0400000US16    16     ID         Idaho   00
## 5      50 01779802 0400000US50    50     VT       Vermont   00
## 6      09 01779780 0400000US09    09     CT   Connecticut   00
##          ALAND     AWATER        REGION                       geometry
## 1  62265662566  489840834         South MULTIPOLYGON (((-82.6432 38...
## 2 143784114293 6211277447 North Central MULTIPOLYGON (((-91.51297 4...
## 3  25150696145 6980371026         South MULTIPOLYGON (((-76.05015 3...
## 4 214048160737 2393355752          West MULTIPOLYGON (((-117.2427 4...
## 5  23873457570 1031134839     Northeast MULTIPOLYGON (((-73.43774 4...
## 6  12542619303 1815495323     Northeast MULTIPOLYGON (((-72.76143 4...
cbsa <- read_sf('data/cb_2017_us_cbsa_500k/cb_2017_us_cbsa_500k.shp')
head(cbsa)
## Simple feature collection with 6 features and 8 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -98.95394 ymin: 32.84464 xmax: -79.4987 ymax: 43.8495
## epsg (SRID):    4269
## proj4string:    +proj=longlat +datum=NAD83 +no_defs
## # A tibble: 6 x 9
##   CSAFP CBSAFP AFFGEOID GEOID NAME  LSAD    ALAND AWATER
##   <chr> <chr>  <chr>    <chr> <chr> <chr>   <dbl>  <dbl>
## 1 <NA>  43620  310M300… 43620 Siou… M1    6.67e 9 2.79e7
## 2 357   12660  310M300… 12660 Bara… M2    2.15e 9 4.57e7
## 3 <NA>  40220  310M300… 40220 Roan… M1    4.84e 9 7.25e7
## 4 <NA>  48660  310M300… 48660 Wich… M1    6.78e 9 1.44e8
## 5 290   22840  310M300… 22840 Fort… M2    2.01e 9 4.12e6
## 6 122   12060  310M300… 12060 Atla… M1    2.25e10 3.93e8
## # … with 1 more variable: geometry <MULTIPOLYGON [°]>
plot(cbsa$geometry)

conus_cbsa <- cbsa %>%
  # dplyr::mutate(STUSPS = stringr::str_extract(NAME, '(?<=, )[^,]+$')) %>% 
  dplyr::filter(st_intersects(., conus_outline, sparse = F))
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
plot(conus_cbsa$geometry)

amtrak_cbsa <- conus_cbsa %>%
  dplyr::filter(st_intersects(., amtrak, sparse = T) %>%
                  purrr::map_lgl(function(x){
                    !identical(x, integer(0))
                  }))
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
plot(conus_outline)
plot(amtrak$geometry, add = T)
plot(amtrak_cbsa$geometry, add = T)

conus_regions <- conus_w_region %>%
  dplyr::group_by(REGION) %>%
  tidyr::nest() %>%
  {
    purrr::map2(.$data, .$REGION, function(d, r){
    d  %>%
      st_union() %>%  #unions all features in subregion
      st_sf() %>% #converts to sf (data frame type object)
      dplyr::mutate(REGION = r) #remember the name of the region
      }
  )} %>%
  do.call(rbind, .)

conus_regions
## Simple feature collection with 4 features and 1 field
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -124.7631 ymin: 24.5231 xmax: -66.9499 ymax: 49.38436
## epsg (SRID):    4269
## proj4string:    +proj=longlat +datum=NAD83 +no_defs
##          REGION                       geometry
## 1         South MULTIPOLYGON (((-82.00977 2...
## 2 North Central MULTIPOLYGON (((-83.48237 4...
## 3          West MULTIPOLYGON (((-118.6105 3...
## 4     Northeast MULTIPOLYGON (((-70.62678 4...
plot(conus_regions$geometry)

northeast <- conus_regions %>%
  dplyr::filter(REGION == 'Northeast') 

northeast_amtrak <- amtrak %>%
  dplyr::filter(st_intersects(., northeast, sparse = F))
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
northeast_amtrak_cbsa <- amtrak_cbsa %>%
  dplyr::filter(st_intersects(., northeast, sparse = F))
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
plot(northeast$geometry)
plot(northeast_amtrak_cbsa$geometry, col = 'lightgrey', add = T)
plot(northeast_amtrak$geometry, col = 'red', lwd = 1,  add = T)

census_data <- readr::read_csv('data/PEP_2017.csv')
## Parsed with column specification:
## cols(
##   NAME = col_character(),
##   GEOID = col_double(),
##   respop72017 = col_double()
## )
census_data
## # A tibble: 393 x 3
##    NAME                                         GEOID respop72017
##    <chr>                                        <dbl>       <dbl>
##  1 United States                                   NA   325719178
##  2 In metropolitan statistical area                NA   279698020
##  3 Abilene, TX Metro Area                       10180      170219
##  4 Akron, OH Metro Area                         10420      703505
##  5 Albany, GA Metro Area                        10500      151434
##  6 Albany, OR Metro Area                        10540      125047
##  7 Albany-Schenectady-Troy, NY Metro Area       10580      886188
##  8 Albuquerque, NM Metro Area                   10740      910726
##  9 Alexandria, LA Metro Area                    10780      153984
## 10 Allentown-Bethlehem-Easton, PA-NJ Metro Area 10900      840550
## # … with 383 more rows
census_data <- census_data %>%
  dplyr::mutate(NAME = stringr::str_replace(NAME, ' Metro Area$', ''),
                GEOID = as.character(GEOID)) 
amtrak_cbsa <- amtrak_cbsa %>%
  dplyr::left_join(census_data)
## Joining, by = c("GEOID", "NAME")
northeast_amtrak_cbsa <- northeast_amtrak_cbsa %>%
  dplyr::left_join(census_data)
## Joining, by = c("GEOID", "NAME")
plot(northeast$geometry)
plot(northeast_amtrak_cbsa['respop72017'], add = T)
plot(northeast_amtrak$geometry, col = 'red', lwd = 1,  add = T)

amtrak_points <- st_cast(northeast_amtrak, 'POINT')
## Warning in st_cast.sf(northeast_amtrak, "POINT"): repeating attributes for
## all sub-geometries for which they may not be constant
amtrak_lines <- st_cast(northeast_amtrak, 'LINESTRING')
## Warning in st_cast.sf(northeast_amtrak, "LINESTRING"): repeating attributes
## for all sub-geometries for which they may not be constant
plot(amtrak_points$geometry)

amtrak_cbsa_points <- northeast_amtrak_cbsa %>%
  dplyr::select(NAME, respop72017) %>%
  st_join(amtrak_points, .)
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
amtrak_cbsa_lines <- northeast_amtrak_cbsa %>%
  dplyr::select(NAME, respop72017) %>%
  st_join(amtrak_lines, ., largest = T)
## although coordinates are longitude/latitude, st_intersection assumes that they are planar
## Warning: attribute variables are assumed to be spatially constant
## throughout all geometries
amtrak_cbsa_points
## Simple feature collection with 267797 features and 6 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: -90.11523 ymin: 25.84845 xmax: -69.96816 ymax: 45.50603
## epsg (SRID):    4269
## proj4string:    +proj=longlat +datum=NAD83 +no_defs
## First 10 features:
##     OBJECTID NAME.x Shape_Leng ShapeSTLen
## 1          1  Acela   741010.5   977920.7
## 1.1        1  Acela   741010.5   977920.7
## 1.2        1  Acela   741010.5   977920.7
## 1.3        1  Acela   741010.5   977920.7
## 1.4        1  Acela   741010.5   977920.7
## 1.5        1  Acela   741010.5   977920.7
## 1.6        1  Acela   741010.5   977920.7
## 1.7        1  Acela   741010.5   977920.7
## 1.8        1  Acela   741010.5   977920.7
## 1.9        1  Acela   741010.5   977920.7
##                                           NAME.y respop72017
## 1   Washington-Arlington-Alexandria, DC-VA-MD-WV     6216589
## 1.1 Washington-Arlington-Alexandria, DC-VA-MD-WV     6216589
## 1.2 Washington-Arlington-Alexandria, DC-VA-MD-WV     6216589
## 1.3 Washington-Arlington-Alexandria, DC-VA-MD-WV     6216589
## 1.4 Washington-Arlington-Alexandria, DC-VA-MD-WV     6216589
## 1.5 Washington-Arlington-Alexandria, DC-VA-MD-WV     6216589
## 1.6 Washington-Arlington-Alexandria, DC-VA-MD-WV     6216589
## 1.7 Washington-Arlington-Alexandria, DC-VA-MD-WV     6216589
## 1.8 Washington-Arlington-Alexandria, DC-VA-MD-WV     6216589
## 1.9 Washington-Arlington-Alexandria, DC-VA-MD-WV     6216589
##                       geometry
## 1    POINT (-77.01421 38.8836)
## 1.1 POINT (-77.01371 38.88349)
## 1.2 POINT (-77.01356 38.88345)
## 1.3 POINT (-77.01334 38.88342)
## 1.4 POINT (-77.01308 38.88339)
## 1.5 POINT (-77.01293 38.88339)
## 1.6 POINT (-77.01269 38.88338)
## 1.7 POINT (-77.01266 38.88338)
## 1.8 POINT (-77.01227 38.88341)
## 1.9 POINT (-77.01205 38.88343)
amtrak_cbsa_lines
## Simple feature collection with 322 features and 6 fields
## geometry type:  LINESTRING
## dimension:      XY
## bbox:           xmin: -90.11523 ymin: 25.84845 xmax: -69.96816 ymax: 45.50603
## epsg (SRID):    4269
## proj4string:    +proj=longlat +datum=NAD83 +no_defs
## First 10 features:
##     OBJECTID NAME.x Shape_Leng ShapeSTLen
## 1          1  Acela   741010.5   977920.7
## 1.1        1  Acela   741010.5   977920.7
## 1.2        1  Acela   741010.5   977920.7
## 1.3        1  Acela   741010.5   977920.7
## 1.4        1  Acela   741010.5   977920.7
## 1.5        1  Acela   741010.5   977920.7
## 1.6        1  Acela   741010.5   977920.7
## 1.7        1  Acela   741010.5   977920.7
## 1.8        1  Acela   741010.5   977920.7
## 1.9        1  Acela   741010.5   977920.7
##                                          NAME.y respop72017
## 1   Philadelphia-Camden-Wilmington, PA-NJ-DE-MD     6096120
## 1.1       New York-Newark-Jersey City, NY-NJ-PA    20320876
## 1.2       New York-Newark-Jersey City, NY-NJ-PA    20320876
## 1.3       New York-Newark-Jersey City, NY-NJ-PA    20320876
## 1.4       New York-Newark-Jersey City, NY-NJ-PA    20320876
## 1.5       New York-Newark-Jersey City, NY-NJ-PA    20320876
## 1.6       New York-Newark-Jersey City, NY-NJ-PA    20320876
## 1.7       New York-Newark-Jersey City, NY-NJ-PA    20320876
## 1.8       New York-Newark-Jersey City, NY-NJ-PA    20320876
## 1.9       New York-Newark-Jersey City, NY-NJ-PA    20320876
##                           geometry
## 1   LINESTRING (-77.01421 38.88...
## 1.1 LINESTRING (-74.1708 40.727...
## 1.2 LINESTRING (-74.1708 40.727...
## 1.3 LINESTRING (-74.15611 40.73...
## 1.4 LINESTRING (-74.11723 40.74...
## 1.5 LINESTRING (-74.115 40.7466...
## 1.6 LINESTRING (-74.115 40.7466...
## 1.7 LINESTRING (-73.93893 40.74...
## 1.8 LINESTRING (-73.91594 40.75...
## 1.9 LINESTRING (-73.91594 40.75...
plot(amtrak_cbsa_lines['respop72017'])
plot(northeast$geometry, add = T)

amtrak_cbsa_points <-  amtrak_cbsa_points %>%
  dplyr::select(rail = NAME.x,
                city = NAME.y,
                pop2017 = respop72017) %>%
  dplyr::mutate(rail = as.character(rail),
                city = as.character(city))
plot(northeast$geometry, col = 'grey90')
plot(northeast_amtrak$geometry, col = 'darkslategrey', lwd = 1, add = T)
plot(amtrak_cbsa_points['pop2017'], cex = .25, add = T)

Leaflet

library(leaflet)
center_conus <- st_centroid(conus_outline) %>%
  st_coordinates()
## Warning in st_centroid.sfc(conus_outline): st_centroid does not give
## correct centroids for longitude/latitude data
print('')
## [1] ""
center_conus
##           X        Y
## 1 -99.39245 39.39516
m <- leaflet() %>%
  setView(lng = center_conus[1],
          lat = center_conus[2],
          zoom = 3) %>%
  addTiles()  # Add default OpenStreetMap map tiles
  
m
m <- m %>%
  addPolygons(data = conus_regions, 
              color  = 'black', 
              weight = 1, 
              fillColor = 'white',
              fillOpacity = .1)
## Warning: sf layer has inconsistent datum (+proj=longlat +datum=NAD83 +no_defs).
## Need '+proj=longlat +datum=WGS84'
m
pal <- colorNumeric(
  palette = c("#edf8b1", "#2c7fb8"),
  domain = amtrak_cbsa$respop72017)

m <- m %>%
  addPolygons(data = amtrak_cbsa,  
              fillColor = ~pal(respop72017),
              fillOpacity = .8,
              color = 'black', 
              weight = 1,
              label = ~paste(NAME, "'s Expected Population for 2017 was", respop72017)
              ) %>%
  addLegend("bottomleft", 
            data = amtrak_cbsa,
            pal = pal, 
            values = ~respop72017,
            title = "Est. Population (2017)",
            opacity = 1
  )
## Warning: sf layer has inconsistent datum (+proj=longlat +datum=NAD83 +no_defs).
## Need '+proj=longlat +datum=WGS84'
m
m %>%
  addPolylines(data = amtrak,
               color = 'red', 
               weight = 3)
## Warning: sf layer has inconsistent datum (+proj=longlat +datum=NAD83 +no_defs).
## Need '+proj=longlat +datum=WGS84'

```